etree_lxml.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:

- Text or comments as siblings of the root element
- Doctypes with no name

When any of these things occur, we emit a DataLossWarning.
"""
  9. from __future__ import absolute_import, division, unicode_literals
  10. # pylint:disable=protected-access
  11. import warnings
  12. import re
  13. import sys
  14. try:
  15. from collections.abc import MutableMapping
  16. except ImportError:
  17. from collections import MutableMapping
  18. from . import base
  19. from ..constants import DataLossWarning
  20. from .. import constants
  21. from . import etree as etree_builders
  22. from .. import _ihatexml
  23. import lxml.etree as etree
  24. from pip._vendor.six import PY3, binary_type
# Module-level switch read by TreeBuilder.getDocument: when true the whole
# ElementTree is returned, otherwise only its root element.
fullTree = True
# Splits a "{namespace}localname" string into its namespace and local name.
tag_regexp = re.compile("{([^}]*)}(.*)")
# The ``tag`` value of an lxml comment node; element tags are compared against
# it to recognise comments.
comment_type = etree.Comment("asd").tag
  28. class DocumentType(object):
  29. def __init__(self, name, publicId, systemId):
  30. self.name = name
  31. self.publicId = publicId
  32. self.systemId = systemId
  33. class Document(object):
  34. def __init__(self):
  35. self._elementTree = None
  36. self._childNodes = []
  37. def appendChild(self, element):
  38. last = self._elementTree.getroot()
  39. for last in self._elementTree.getroot().itersiblings():
  40. pass
  41. last.addnext(element._element)
  42. def _getChildNodes(self):
  43. return self._childNodes
  44. childNodes = property(_getChildNodes)
  45. def testSerializer(element):
  46. rv = []
  47. infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
  48. def serializeElement(element, indent=0):
  49. if not hasattr(element, "tag"):
  50. if hasattr(element, "getroot"):
  51. # Full tree case
  52. rv.append("#document")
  53. if element.docinfo.internalDTD:
  54. if not (element.docinfo.public_id or
  55. element.docinfo.system_url):
  56. dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
  57. else:
  58. dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
  59. element.docinfo.root_name,
  60. element.docinfo.public_id,
  61. element.docinfo.system_url)
  62. rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
  63. next_element = element.getroot()
  64. while next_element.getprevious() is not None:
  65. next_element = next_element.getprevious()
  66. while next_element is not None:
  67. serializeElement(next_element, indent + 2)
  68. next_element = next_element.getnext()
  69. elif isinstance(element, str) or isinstance(element, bytes):
  70. # Text in a fragment
  71. assert isinstance(element, str) or sys.version_info[0] == 2
  72. rv.append("|%s\"%s\"" % (' ' * indent, element))
  73. else:
  74. # Fragment case
  75. rv.append("#document-fragment")
  76. for next_element in element:
  77. serializeElement(next_element, indent + 2)
  78. elif element.tag == comment_type:
  79. rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
  80. if hasattr(element, "tail") and element.tail:
  81. rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
  82. else:
  83. assert isinstance(element, etree._Element)
  84. nsmatch = etree_builders.tag_regexp.match(element.tag)
  85. if nsmatch is not None:
  86. ns = nsmatch.group(1)
  87. tag = nsmatch.group(2)
  88. prefix = constants.prefixes[ns]
  89. rv.append("|%s<%s %s>" % (' ' * indent, prefix,
  90. infosetFilter.fromXmlName(tag)))
  91. else:
  92. rv.append("|%s<%s>" % (' ' * indent,
  93. infosetFilter.fromXmlName(element.tag)))
  94. if hasattr(element, "attrib"):
  95. attributes = []
  96. for name, value in element.attrib.items():
  97. nsmatch = tag_regexp.match(name)
  98. if nsmatch is not None:
  99. ns, name = nsmatch.groups()
  100. name = infosetFilter.fromXmlName(name)
  101. prefix = constants.prefixes[ns]
  102. attr_string = "%s %s" % (prefix, name)
  103. else:
  104. attr_string = infosetFilter.fromXmlName(name)
  105. attributes.append((attr_string, value))
  106. for name, value in sorted(attributes):
  107. rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
  108. if element.text:
  109. rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
  110. indent += 2
  111. for child in element:
  112. serializeElement(child, indent)
  113. if hasattr(element, "tail") and element.tail:
  114. rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
  115. serializeElement(element, 0)
  116. return "\n".join(rv)
  117. def tostring(element):
  118. """Serialize an element and its child nodes to a string"""
  119. rv = []
  120. def serializeElement(element):
  121. if not hasattr(element, "tag"):
  122. if element.docinfo.internalDTD:
  123. if element.docinfo.doctype:
  124. dtd_str = element.docinfo.doctype
  125. else:
  126. dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
  127. rv.append(dtd_str)
  128. serializeElement(element.getroot())
  129. elif element.tag == comment_type:
  130. rv.append("<!--%s-->" % (element.text,))
  131. else:
  132. # This is assumed to be an ordinary element
  133. if not element.attrib:
  134. rv.append("<%s>" % (element.tag,))
  135. else:
  136. attr = " ".join(["%s=\"%s\"" % (name, value)
  137. for name, value in element.attrib.items()])
  138. rv.append("<%s %s>" % (element.tag, attr))
  139. if element.text:
  140. rv.append(element.text)
  141. for child in element:
  142. serializeElement(child)
  143. rv.append("</%s>" % (element.tag,))
  144. if hasattr(element, "tail") and element.tail:
  145. rv.append(element.tail)
  146. serializeElement(element)
  147. return "".join(rv)
  148. class TreeBuilder(base.TreeBuilder):
  149. documentClass = Document
  150. doctypeClass = DocumentType
  151. elementClass = None
  152. commentClass = None
  153. fragmentClass = Document
  154. implementation = etree
  155. def __init__(self, namespaceHTMLElements, fullTree=False):
  156. builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
  157. infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
  158. self.namespaceHTMLElements = namespaceHTMLElements
  159. class Attributes(MutableMapping):
  160. def __init__(self, element):
  161. self._element = element
  162. def _coerceKey(self, key):
  163. if isinstance(key, tuple):
  164. name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
  165. else:
  166. name = infosetFilter.coerceAttribute(key)
  167. return name
  168. def __getitem__(self, key):
  169. value = self._element._element.attrib[self._coerceKey(key)]
  170. if not PY3 and isinstance(value, binary_type):
  171. value = value.decode("ascii")
  172. return value
  173. def __setitem__(self, key, value):
  174. self._element._element.attrib[self._coerceKey(key)] = value
  175. def __delitem__(self, key):
  176. del self._element._element.attrib[self._coerceKey(key)]
  177. def __iter__(self):
  178. return iter(self._element._element.attrib)
  179. def __len__(self):
  180. return len(self._element._element.attrib)
  181. def clear(self):
  182. return self._element._element.attrib.clear()
  183. class Element(builder.Element):
  184. def __init__(self, name, namespace):
  185. name = infosetFilter.coerceElement(name)
  186. builder.Element.__init__(self, name, namespace=namespace)
  187. self._attributes = Attributes(self)
  188. def _setName(self, name):
  189. self._name = infosetFilter.coerceElement(name)
  190. self._element.tag = self._getETreeTag(
  191. self._name, self._namespace)
  192. def _getName(self):
  193. return infosetFilter.fromXmlName(self._name)
  194. name = property(_getName, _setName)
  195. def _getAttributes(self):
  196. return self._attributes
  197. def _setAttributes(self, value):
  198. attributes = self.attributes
  199. attributes.clear()
  200. attributes.update(value)
  201. attributes = property(_getAttributes, _setAttributes)
  202. def insertText(self, data, insertBefore=None):
  203. data = infosetFilter.coerceCharacters(data)
  204. builder.Element.insertText(self, data, insertBefore)
  205. def cloneNode(self):
  206. element = type(self)(self.name, self.namespace)
  207. if self._element.attrib:
  208. element._element.attrib.update(self._element.attrib)
  209. return element
  210. class Comment(builder.Comment):
  211. def __init__(self, data):
  212. data = infosetFilter.coerceComment(data)
  213. builder.Comment.__init__(self, data)
  214. def _setData(self, data):
  215. data = infosetFilter.coerceComment(data)
  216. self._element.text = data
  217. def _getData(self):
  218. return self._element.text
  219. data = property(_getData, _setData)
  220. self.elementClass = Element
  221. self.commentClass = Comment
  222. # self.fragmentClass = builder.DocumentFragment
  223. base.TreeBuilder.__init__(self, namespaceHTMLElements)
  224. def reset(self):
  225. base.TreeBuilder.reset(self)
  226. self.insertComment = self.insertCommentInitial
  227. self.initial_comments = []
  228. self.doctype = None
  229. def testSerializer(self, element):
  230. return testSerializer(element)
  231. def getDocument(self):
  232. if fullTree:
  233. return self.document._elementTree
  234. else:
  235. return self.document._elementTree.getroot()
  236. def getFragment(self):
  237. fragment = []
  238. element = self.openElements[0]._element
  239. if element.text:
  240. fragment.append(element.text)
  241. fragment.extend(list(element))
  242. if element.tail:
  243. fragment.append(element.tail)
  244. return fragment
  245. def insertDoctype(self, token):
  246. name = token["name"]
  247. publicId = token["publicId"]
  248. systemId = token["systemId"]
  249. if not name:
  250. warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
  251. self.doctype = None
  252. else:
  253. coercedName = self.infosetFilter.coerceElement(name)
  254. if coercedName != name:
  255. warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
  256. doctype = self.doctypeClass(coercedName, publicId, systemId)
  257. self.doctype = doctype
  258. def insertCommentInitial(self, data, parent=None):
  259. assert parent is None or parent is self.document
  260. assert self.document._elementTree is None
  261. self.initial_comments.append(data)
  262. def insertCommentMain(self, data, parent=None):
  263. if (parent == self.document and
  264. self.document._elementTree.getroot()[-1].tag == comment_type):
  265. warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
  266. super(TreeBuilder, self).insertComment(data, parent)
  267. def insertRoot(self, token):
  268. # Because of the way libxml2 works, it doesn't seem to be possible to
  269. # alter information like the doctype after the tree has been parsed.
  270. # Therefore we need to use the built-in parser to create our initial
  271. # tree, after which we can add elements like normal
  272. docStr = ""
  273. if self.doctype:
  274. assert self.doctype.name
  275. docStr += "<!DOCTYPE %s" % self.doctype.name
  276. if (self.doctype.publicId is not None or
  277. self.doctype.systemId is not None):
  278. docStr += (' PUBLIC "%s" ' %
  279. (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
  280. if self.doctype.systemId:
  281. sysid = self.doctype.systemId
  282. if sysid.find("'") >= 0 and sysid.find('"') >= 0:
  283. warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
  284. sysid = sysid.replace("'", 'U00027')
  285. if sysid.find("'") >= 0:
  286. docStr += '"%s"' % sysid
  287. else:
  288. docStr += "'%s'" % sysid
  289. else:
  290. docStr += "''"
  291. docStr += ">"
  292. if self.doctype.name != token["name"]:
  293. warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
  294. docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
  295. root = etree.fromstring(docStr)
  296. # Append the initial comments:
  297. for comment_token in self.initial_comments:
  298. comment = self.commentClass(comment_token["data"])
  299. root.addprevious(comment._element)
  300. # Create the root document and add the ElementTree to it
  301. self.document = self.documentClass()
  302. self.document._elementTree = root.getroottree()
  303. # Give the root element the right name
  304. name = token["name"]
  305. namespace = token.get("namespace", self.defaultNamespace)
  306. if namespace is None:
  307. etree_tag = name
  308. else:
  309. etree_tag = "{%s}%s" % (namespace, name)
  310. root.tag = etree_tag
  311. # Add the root element to the internal child/open data structures
  312. root_element = self.elementClass(name, namespace)
  313. root_element._element = root
  314. self.document._childNodes.append(root_element)
  315. self.openElements.append(root_element)
  316. # Reset to the default insert comment function
  317. self.insertComment = self.insertCommentMain